##########################################
### Analysis: Counterfactual Models ######
##########################################

## Purpose: This script includes the 
## visuals and models used to construct 
## our effect and levels counterfactual plots.
## Data In: 
## 1) data_files/datafull_12_5_2023.rds
## 2) data_files/Frank_WID_2015.dta
## 3) data_files/Frank_Gini_2015.dta
## 4) data_files/wid_data.csv
## 5) data_files/cps_pop_estimates.rds
## 6) data_files/cps_pop_estimates_5_year_bin.rds"
## Data Out:
## 1) figures/overall_plot.pdf
## 2) figures/kink_point_viz
## 3) figures/counterfactual_baseline_change
## 4) figures/counterfactual_effects_party_1992
## 5) figures/counterfactual_effects_party_5_year_bins
## 6) figures/counterfactual_effects_party_1990
## 7) figures/counterfactual_effects_party_1992_indep_ref
## 8) figures/counterfactual_levels_1992
## 9) figures/counterfactual_levels_1992_party_ses
## Software Versions: 

## R version 4.4.1

library(tidyverse) ## 2.0.0
library(MASS) ## 7.3.60.2
library(readstata13) ## 0.10.1
library(openxlsx) ## 4.2.6.1
library(viridis) ## 0.6.5
library(RColorBrewer) ## 1.1.3
library(diagis) ## 0.2.3

## set working directory to 
## replication package

data.full <- readRDS("data_files/datafull_5_6_2025.rds")

## before reading in these files
## follow the instructions in the README
## for downloading the 
## United States state level gini 
## and top income share datasets
income_share_data <- read.dta13("data_files/Frank_WID_2015.dta") 
gini_data <- read.dta13("data_files/Frank_Gini_2015.dta")


## before reading in this file 
## follow the instructions in the README
## for downloading the U.S. wealth inequality data.
saez <- read.csv("data_files/wid_data.csv") 

## CPS population estimates (weights
## constructed in 02_create_datasets.R file)
weights <- readRDS("data_files/cps_pop_estimates.rds")
weights_5 <- readRDS("data_files/cps_pop_estimates_5_year_bin.rds")

## colorblind palette for figures
cbbPalette <- c("#000000", "#E69F00", 
                "#56B4E9", "#009E73", 
                "#F0E442", "#0072B2", 
                "#D55E00", "#CC79A7")


'%ni%' <- Negate("%in%")

## weighted proportion function
weighted_proportion <- function(vec, value, weight){
  num <- (vec == value)*weight
  num <- sum(num)
  denom <- sum(weight)
  prop <- num / denom
  return(prop)
}


##############################
###### Overall Graph #########
##############################

## Here is the code to create
## Figure 1 from the main text,
## which displays our estimates for
## the percent of respondents who
## agree the "rich are getting richer",
## weighted by post-stratification weights to
## be more representative of the US
## population. We overlay these
## estimates with estimates of actual
## inequality in the United States,
## both income and wealth inequality.

## formatting data on top
## 10% share of income in the 
## United States
income_share <- income_share_data %>% 
  filter(State == "United States") 
income_share <- income_share %>% 
  mutate(Top10_adj = Top10_adj/100)

## formatting data on top
## 10% share of wealth in the 
## United States
saez <- saez[2:nrow(saez),]
colnames(saez)[c(2,3)] <- c("Year", "Wealth10")
saez$Year <- as.numeric(saez$Year)
saez$Wealth10 <- as.numeric(saez$Wealth10)

## joining the two datasets
income_share <- left_join(income_share, saez, by = "Year")
income_share <- income_share %>% dplyr::select(Year, Wealth10, Top10_adj)

income_share <- income_share %>% gather(key = "Measure", value = "mean", -Year)

income_share$Measure <- dplyr::recode(income_share$Measure,
                              `Top10_adj` = "Top 10% share of income",
                              `Wealth10` = "Top 10% share of wealth")
colnames(income_share)[1] <- "year"
income_share$year <- as.Date(paste0(income_share$year,
                            "-01-01"))

data.full$year_date <- as.Date(paste0(data.full$year,
                                      "-01-01"))

## estimating percent of respondents
## who agree the rich are getting richer,
## weighted by post-stratification weights
## to be more representative of US 
## adult population
overall_mean <- data.full %>%
  group_by(year_date) %>%
  summarise(mean = stats::weighted.mean(inequality.num, w = weight.fulldata),
            se = weighted_se(inequality.num,
                             w = weight.fulldata),
            lower = mean + qnorm(.025)*se,
            upper = mean + qnorm(.975)*se)

colnames(overall_mean)[1] <- "year"

## Figure 1: 
ggplot(overall_mean,
      aes(x = year,
          y = mean)) +
  geom_point(mapping = aes(color = "Weighted year means")) +
  geom_ribbon(mapping = aes(ymin = lower,
                            ymax = upper),
              alpha = .2,
              fill = cbbPalette[c(3)]) +
  geom_smooth(se = FALSE,
              mapping = aes(color = "Loess smoothing curve")) +
  theme_bw() +
  geom_line(data = income_share,
            aes(linetype = Measure)) +
  facet_wrap(~Measure) +
  coord_cartesian(xlim = c(as.Date("1966-01-01"), 
                           as.Date("2013-01-01"))) +
  scale_y_continuous(NULL, limits = c(0,.83),
                     labels = scales::percent_format()) +
  scale_color_manual(name = "Inequality Perceptions", values = 
                       cbbPalette[c(2:3)]) +
  scale_linetype_discrete(name = "Actual Inequality") +
  theme(legend.direction = "vertical", legend.position = "bottom") +
  labs(x = NULL)
## figures/overall_plot.pdf
## 5x7

###########################
## Harris vs. Pew Pattern #
###########################

## Here we create a visualization
## of the percent of respondents who
## agree that the "rich are getting richer"
## broken out by survey Harris vs. Pew.
## We include this visualization in section H.

## variable for whether Harris or Pew survey
data.full$harris <- ifelse(grepl("Pew", data.full$study),
                           "Pew", "Harris")

## estimating means. In this case
## we don't use survey weights
## as these were created at the year level
## (combining across surveys).

overall_mean <- data.full %>%
  group_by(year_date, harris) %>%
  summarise(mean = mean(inequality.num),
            se = sqrt(mean * (1-mean)/(n()-1)),
            lower = mean + qnorm(.025)*se,
            upper = mean + qnorm(.975)*se)

colnames(overall_mean)[1] <- "year"


ggplot(overall_mean,
       aes(x = year,
           color = harris,
           fill = harris,
           y = mean)) +
  geom_point() +
  geom_line() +
  geom_ribbon(mapping = aes(ymin = lower,
                            ymax = upper),
              alpha = .2,
              colour = "white") +
  theme_bw() +
  coord_cartesian(xlim = c(as.Date("1966-01-01"), 
                           as.Date("2013-01-01"))) +
  scale_y_continuous(labels = scales::percent_format()) +
  scale_color_manual(values = 
                       cbbPalette[c(2:3)]) +
  scale_fill_manual(values = 
                       cbbPalette[c(2:3)]) +
  theme(legend.position = "bottom") +
  labs(y = 'Percent of Respondents Who Agree\n"Rich are Getting Richer"',
       fill = NULL,
       color = NULL,
       x = NULL)
## figures/overall_plot_study.pdf


#################################################
#### Plot to Determine Counterfactual Point ######
##################################################

## In this section we determine the "kinkpoint"
## we use for our counterfactual analyses: what
## year should we set at the baseline. For the 
## counterfactual effects plot (figure 5) we set
## the average "effect" of being a republic vis-a-vis
## Democrat at the average difference observed in that
## year, and for the counterfactual levels plot
## (figure 6) we set the distribution of Republicans,
## Democrats, and Indep/Other to the distribution
## observed in that year. 

## We use a piecewise regression and visual inspect
## to determine the kinkpoint. The visual inspection
## is included in the SI, section C. 



## unique years in data
breaks <- unique(data.full$year)[order(unique(data.full$year))]
breaks <- breaks[breaks > 1988 & breaks < 1993]

## piecewise regression
## for each regression we calculate the
## mean squared error
mse <- list()
for(i in 1:length(breaks)){
  piecewise1 <- lm(inequality.num ~ year*(year < breaks[i]) + 
                     year*(year>=breaks[i]), data = data.full)
  mse[i] <- summary(piecewise1)[6]
}

## We choose the kinkpoint based
## on the year with the lowest mean squared
## error: 1992
mse <- as.numeric(mse)
breaks[which(mse==min(mse))]

## We visualize this trend with the 
## following visualization of estimated values
## for identified republicans, democrats, and
## Indep/Other by eyar. 

party_lm <- lm(inequality.num ~ factor(year)*party.harm + 
                 ses + cohort2 + race2 + gender, data = data.full)
vcov <- vcov(party_lm)
betas <- coef(party_lm)

set.seed(08540)
beta.draws <- MASS::mvrnorm(n = 10000,
                            mu = betas,
                            Sigma = vcov)
matrix <- matrix(0, nrow = length(unique(data.full$year_date))*3,
                 ncol = length(betas))
matrix[,1] <- 1
colnames(matrix) <- names(betas)

## years in matrix
for(i in 2:37){
  matrix[i,i] <- 1
  matrix[i+37,i] <- 1
  matrix[i+74,i] <- 1
}

## first 37 entries: Republicans
## second 37 entries: Democrats
## third 37 entries: Indep/Other
matrix[38:(38+36), "party.harmDemocrat"] <- 1
matrix[75:nrow(matrix), "party.harmIndep/Other"] <- 1

## year interactions
for(i in 2:37){
  matrix[i+37,i+49] <- 1
  matrix[i+74,i+85] <- 1
}

## other variables set at medians
matrix[, "ses3"] <- 1 ## median ses
## gender male
matrix[, "gendermale"] <- 1

sim <- base::apply(beta.draws, 1, function(x){
  vector <- matrix %*% x
  return(vector)
})

estimates <- base::apply(sim, 1, function(x){
  mean <- mean(x)
  lower <- quantile(x, .025)
  upper <- quantile(x, .975)
  frame <- data.frame(mean = mean,
                      lower = lower,
                      upper = upper)
  return(frame)
})
estimates <- bind_rows(estimates)
estimates$party <- c(rep("Republican", 37),
                     rep("Democrat", 37),
                     rep("Indep/Other", 37))
estimates$year <- levels(factor(data.full$year))
estimates$year <- as.Date(paste0(estimates$year,
                                 "-01-01"))


ggplot(estimates,
       aes(x = year,
           y = mean)) +
  geom_point(mapping = aes(color = party)) +
  geom_line(mapping = aes(color = party)) +
  geom_errorbar(mapping = aes(ymin = lower,
                              ymax = upper,
                              color = party),
                width = 0) +
  geom_ribbon(data = overall_mean,
              mapping = aes(fill = "Weighted\nOverall Mean",
                            ymin = lower,
                            ymax = upper),
              alpha = .4) +
  theme_bw() +
  scale_color_manual(values = c("#2c7fb8",
                                "#31a354",
                                "#f03b20")) +
  scale_fill_manual(values = c("#9ebcda")) +
  scale_y_continuous(labels = scales::percent_format()) +
  labs(x = NULL,
       color = NULL,
       fill = NULL,
       y = "Percent of Respondents\nWho Agree the ``Rich Get Richer''") +
  geom_vline(mapping = aes(xintercept = as.Date("1992-01-01")),
             lty = 2) +
  annotate(x = as.Date("1980-1-1"),
           y = .45,
           geom = "label",
           label = "1992 is last year\nprior to asymmetric\nRepublican polarization") +
  annotate(x = as.Date("1980-1-1"),
           xend = as.Date("1992-1-1"),
           y = .5,
           yend = .55,
           geom = "curve",
           curvature = -.3,
           arrow = arrow(length = unit(2, units = "mm"))) +
  coord_cartesian(ylim = c(.4, 1))
##figures/kink_point_viz
## 5x8.5

## gap 1990 vs. 1992
estimates$mean[estimates$party == "Democrat" &
                 estimates$year == "1992-01-01"] -
  estimates$mean[estimates$party == "Republican" &
                   estimates$year == "1992-01-01"]
## .144
estimates$mean[estimates$party == "Democrat" &
                 estimates$year == "1990-01-01"] -
  estimates$mean[estimates$party == "Republican" &
                   estimates$year == "1990-01-01"]
## .206

##############################
### Choosing Baseline ########
##############################

## As we discuss below, for our counterfactual
## analysis we needed to choose which partisan
## group would stand in as the "baseline". An
## ideal baseline would be one which the group
## had a fairly flat trend line post-1992, otherwise
## it would be challenging to disentangle the 
## changing gap vs the changing reference point.
## This analysis is included in the SI 
## Section C. 

## This plot estimates the difference
## between the estimated values and the 1992
## baseline for each party group. 

## 1992 estimates
plot2 <- estimates %>%
  group_by(party) %>%
  filter(year == "1992-01-01") %>%
  dplyr::select(mean, party)
colnames(plot2)[1] <- "mean_1992"

## join with estimates
estimates <- left_join(estimates,
                       plot2,
                       by = c("party"))

## take difference between observed
## and 1992 estimates by year
estimates$diff <- estimates$mean -
  estimates$mean_1992


ggplot(estimates[estimates$year >= as.Date("1992-01-01"),],
       mapping = aes(x = year,
                     y = diff,
                     color = party)) +
  geom_point(alpha =.5) +
  geom_line() +
  geom_hline(mapping = aes(yintercept = 0),
             lty = 2) +
  scale_color_manual(values = c("#2c7fb8",
                                "#31a354",
                                "#f03b20")) +
  theme_bw() +
  scale_y_continuous(labels = scales::percent_format()) +
  labs(color = NULL,
       x = NULL,
       y = "Percentage Point Difference in Estimated Values\nBetween 1992 Baseline")

## figures/counterfactual_baseline_change
## 5x7

######################################################
####### Effects Counterfactual: Annual, 1992 #########
######################################################

## This code replicates the counterfactual
## effects plot displayed in the main text,
## Figure 5.

## counterfactual: what if republicans hadn't had their
## big decline vis a vis democrats:
## what would the overall aggregate trend line look like? 
## we're going to estimate this by predicting values for
## all individuals under two scenarios:
## a) observed scenario: we allow identified republicans
## to have their observed asymmetrical decline
## b) counterfactual scenario: we force identified republicans
## to maintain the difference in perceptions with democrats 
## that we observed
## prior to the decline.

## We do this estimation with a non-parametric bootstrap.
## We use a model of inequality perceptions on 
## party identification,
## interacted with year fixed effects and with other controls.
## we use this model to get a prediction for each individual
## on the probability of perceiving inequality 
## under two scenarios: 
## a) observed differences between democrats and republicans
## democrats and indep/other by year
## b) counterfactual scenario where these differences 
## are set at their 1992 levels, the last year before 
## the beginning of asymmetric polarization

## We set democrats as the reference group in this 
## simulation. 
## Party is a factor variable, so one level (in this case
## democrats) is in the reference group, 
## We used democrats b/c they were fairly flat 
## in perceptions over the 1990s (see figure above).
## If we used republicans, for example, all
## groups would be artifically pulled downwards b/c of 
## republicans' asymmetric polarization.

### Simulation Steps:
### 1) We create 10,000 versions of our dataset 
## by re-sampling (with replacement).
### 2) For each dataset, we run our model, and create
### a prediction for each individual in the dataset 
### with two versions of the model:
### a) one using the observed model,
### b) the other a counterfactual model 
### where we set the difference
## between democrats and republicans/indep-other 
## at the estimated
## level before the downturn. 

## This approach include both deterministic
## and stochastic components 
## (see King, Tomz, Wittenberg, 2000),
## We include uncertainty on the 
## coefficients with our non-parametric bootstrap,
## and uncertainty on fundamental error (e) by also 
## simulating an error e_i for each individual.
## We assume each e_i is drawn
## from a normal distribution centered at 0,
## with standard deviation sigma (the later estimated
## by our model). 

## 3) For each dataset with the model-based predictions
## we calculate aggregate averages by year,
## weighted with re-constructed post-stratification weights 
## (we need to reconstruct the weights because otherwise in the
## re-sampled datasets the weights would not sum to one). 

## 4) Across ALL draws, we take the mean (our point estimates 
## for each simulation) and include confidence intervals with the
## .025 and .975 quantiles 

data.boot <- data.full %>%
  filter(year > 1991)

data.boot$party.harm <- factor(data.boot$party.harm,
                               levels = c("Democrat",
                                          "Indep/Other",
                                          "Republican"))

set.seed(97405)
means_annual_92 <- list()
for(i in 1:1000){
  data.boot_c <- data.boot[sample(1:nrow(data.boot),
                                  nrow(data.boot),
                                  replace = TRUE), ]
  model <- lm(inequality.num ~ +
                factor(year)*party.harm + ses + cohort2 + race2 + gender, 
              data = data.boot_c)
  model.m <- model.matrix(model)
  observed <- model.m %*% coef(model) 
  
  coef <- coef(model)
  coef[33:length(coef)] <- 0
  
  counter <- model.m %*% coef
  
  ## fundamental uncertainty - generating errors
  ## for each individual 
  e_i <- rnorm(nrow(model.m), mean = 0, sd = summary(model)$sigma)
  
  observed <- observed + e_i
  counter <- counter + e_i
  
  frame <- data.frame(observed = observed,
                      counter = counter,
                      year = data.boot_c$year_date,
                      weight_cat = data.boot_c$Year_Gender_Age_Edu)
  
  # recalculating weights
  ## 1) create counts by year 
  
  table <- data.boot_c %>%
    dplyr::group_by(year, Year_Gender_Age_Edu) %>%
    dplyr::summarize(survey_n = n())
  
  ## 2) calculate total n for each year
  study_n <- table %>%
    dplyr::group_by(year) %>%
    dplyr::summarize(total = sum(survey_n))
  table <- left_join(table, study_n, by = "year")
  
  ## 3) join weights and table
  table <- left_join(table, weights[, c("Year_Gender_Age_Edu",
                                        "Population")],
                     by = "Year_Gender_Age_Edu")
  
  ## 4) Create weights
  ## observed is the observed percent in our survey for that category
  table$observed <- table$survey_n/table$total
  
  ## Upweights category which were under observed 
  ## in data, downweights over observed categories
  table$weight_boot <- table$Population/table$observed
  
  ## merging with data
  data.boot_c$weight_boot <- table$weight_boot[
    match(data.boot_c$Year_Gender_Age_Edu,
          table$Year_Gender_Age_Edu)
  ]
  
  # trim weights to .3 to 3 
  data.boot_c$weight_boot[!is.na(data.boot_c$weight_boot) &
                             data.boot_c$weight_boot > 3] <- 3
  data.boot_c$weight_boot[is.na(data.boot_c$weight_boot) &
                              data.boot_c$weight_boot < .3] <- .3
  
  frame$weight_boot <- data.boot_c$weight_boot
  
  means_annual_92[[i]] <- frame %>%
    gather(key = "counterfactual",
           value = "percent",
           -year,
           -weight_boot,
           -weight_cat) %>%
    dplyr::group_by(year, counterfactual) %>%
    dplyr::summarize(mean = weighted.mean(percent,
                                          weight_boot))
  print(i)
  
}

means_annual_92 <- bind_rows(means_annual_92)

plot <- means_annual_92 %>%
  dplyr::group_by(year, counterfactual) %>%
  dplyr::summarize(mean_0 = mean(mean, na.rm = TRUE),
            lower = quantile(mean, .025, na.rm = TRUE),
            upper = quantile(mean, .975, na.rm = TRUE))
colnames(plot)[3] <- "mean"


plot$counterfactual <- dplyr::recode(plot$counterfactual,
                                     `counter` = "Counterfactual estimate:\neffect of being\nrepublican/indep\nset at 1992",
                                     `observed` = "Observed estimate:\neffect of being\nrepublican/indep\nset at observed levels")

ggplot(plot,
       aes(x = year,
           y = mean)) +
  geom_smooth(data = overall_mean,
              se = FALSE,
              color = cbbPalette[5]) +
  geom_point(data = overall_mean,
             mapping = aes(color = "Weighted year\nmeans")) +
  geom_ribbon(data = overall_mean,
              mapping = aes(ymin = lower,
                            ymax = upper),
              alpha = .2,
              fill = cbbPalette[5]) +
  theme_bw() +
  geom_point(mapping = aes(color = counterfactual,
                           group = counterfactual,
                           y = mean)) +
  geom_line(mapping = aes(color = counterfactual,
                          group = counterfactual,
                          y = mean)) +
  geom_errorbar(mapping = aes(color = counterfactual,
                              group = counterfactual,
                              ymin = lower,
                              ymax = upper),
                width = 0) +
  scale_y_continuous(labels = scales::percent_format()) +
  scale_color_manual(values = 
                       cbbPalette[c(2,6,5)]) +
  scale_fill_manual(values = 
                      cbbPalette[c(2,6,5)]) +
  scale_linetype_discrete(name = "Actual Inequality") +
  theme(legend.position = "bottom")  +
  labs(color = NULL,
       x = NULL,
       y = "Predicted Percent of Respondents\nWho Agree the ``Rich Get Richer''")
## figures/counterfactual_effects_party_1992
## 5x7

## average difference post-1992 (referenced in main text)
plot %>%
  filter(year != "1992-01-01") %>%
  dplyr::group_by(year) %>%
  dplyr::summarize(diff = mean[1] - mean[2]) %>%
  dplyr::summarize(mean(diff))
## difference of 6.8%

## only 2003 or before (after that democrats increase) 
plot %>%
  filter(year != "1992-01-01" &
           year <= "2003-01-01") %>%
  dplyr::group_by(year) %>%
  dplyr::summarize(diff = mean[1] - mean[2]) %>%
  dplyr::summarize(mean(diff))
## difference of 3.8 percentage points 


################################################
####### Effects Counterfactual: 5 Year Bins ####
################################################

## Here we use 5-year bins rather than 
## yearly bins for our counterfactual effects
## simulaiton. These results are included in the 
## SI, section C. 

## for re-weighting
## create weight category based on 5-year bins
data.boot$Year_Gender_Age_Edu_5 <- gsub("^[0-9]{4,}_",
                                      "", data.boot$Year_Gender_Age_Edu)
data.boot$Year_Gender_Age_Edu_5 <- paste(data.boot$year_5,
                                         data.boot$Year_Gender_Age_Edu_5,
                                         sep = "_")

set.seed(97401)
means_5 <- list()

for(i in 1:1000){
  data.boot_c <- data.boot[sample(1:nrow(data.boot),
                                  nrow(data.boot),
                                  replace = TRUE), ]
  model <- lm(inequality.num ~ +
                factor(year_5)*party.harm + ses + cohort2 + race2 + gender, 
              data = data.boot_c)
  model.m <- model.matrix(model)
  observed <- model.m %*% coef(model) 
  
  coef <- coef(model)
  coef[19:length(coef)] <- 0
  
  counter <- model.m %*% coef
  
  ## fundamental uncertainty - generating errors
  ## for each individual 
  e_i <- rnorm(nrow(model.m), mean = 0, 
               sd = summary(model)$sigma)
  
  observed <- observed + e_i
  counter <- counter + e_i
  
  frame <- data.frame(observed = observed,
                      counter = counter,
                      year = data.boot_c$year_5,
                      weight_cat = data.boot_c$Year_Gender_Age_Edu_5)
  
  # recalculating weights
  ## 1) create counts by year 
  
  table <- data.boot_c %>%
    dplyr::group_by(year_5, Year_Gender_Age_Edu_5) %>%
    dplyr::summarize(survey_n = n())
  
  ## 2) calculate total n for each year
  study_n <- table %>%
    dplyr::group_by(year_5) %>%
    dplyr::summarize(total = sum(survey_n))
  table <- left_join(table, study_n, by = "year_5")
  
  ## 3) join weights and table
  table <- left_join(table, weights_5[, c("Year_Gender_Age_Edu",
                                        "Population")],
                     by = c("Year_Gender_Age_Edu_5" = "Year_Gender_Age_Edu"))
  
  ## 4) Create weights
  ## observed is the observed percent in our survey for that category
  table$observed <- table$survey_n/table$total
  
  ## Upweights category which were under observed 
  ## in data, downweights over observed categories
  table$weight_boot <- table$Population/table$observed
  
  ## merging with data
  data.boot_c$weight_boot <- table$weight_boot[
    match(data.boot_c$Year_Gender_Age_Edu_5,
          table$Year_Gender_Age_Edu_5)
  ]
  
  # trim weights to .3 to 3 
  data.boot_c$weight_boot[!is.na(data.boot_c$weight_boot) &
                            data.boot_c$weight_boot > 3] <- 3
  data.boot_c$weight_boot[is.na(data.boot_c$weight_boot) &
                            data.boot_c$weight_boot < .3] <- .3
  
  frame$weight_boot <- data.boot_c$weight_boot
  
  means_5[[i]] <- frame %>%
    gather(key = "counterfactual",
           value = "percent",
           -year,
           -weight_boot,
           -weight_cat) %>%
    dplyr::group_by(year, counterfactual) %>%
    dplyr::summarize(mean = weighted.mean(percent,
                                          weight_boot))
  print(i)
  
}

means_5 <- bind_rows(means_5)

plot <- means_5 %>%
  dplyr::group_by(year, counterfactual) %>%
  dplyr::summarize(mean_o = mean(mean, na.rm = TRUE),
            lower = quantile(mean, .025, na.rm = TRUE),
            upper = quantile(mean, .975, na.rm = TRUE))
colnames(plot)[3] <- "mean"


plot$counterfactual <- dplyr::recode(plot$counterfactual,
                                     `counter` = "Counterfactual estimate:\neffect of being\nrepublican/indep\nset at 1992,\n5 year bins",
                                     `observed` = "Observed estimate:\neffect of being\nrepublican/indep\nset at observed levels,\n5 year bins")
plot$year <- dplyr::recode(plot$year,
                           `1991-1995` = as.Date("1993-1-1"),
                           `1996-2000` = as.Date("1998-1-1"),
                           `2001-2005` = as.Date("2003-1-1"),
                           `2006-2010` = as.Date("2008-1-1"),
                           `2011-2013` = as.Date("2012-1-1"))


ggplot(plot,
       aes(x = year,
           y = mean)) +
  geom_smooth(data = overall_mean,
              se = FALSE,
              color = cbbPalette[5]) +
  geom_point(data = overall_mean,
             mapping = aes(color = "Weighted year\nmeans")) +
  geom_ribbon(data = overall_mean,
              mapping = aes(ymin = lower,
                            ymax = upper),
              alpha = .2,
              fill = cbbPalette[c(5)]) +
  theme_bw() +
  geom_point(mapping = aes(color = counterfactual,
                           group = counterfactual,
                           y = mean)) +
  geom_line(mapping = aes(color = counterfactual,
                          group = counterfactual,
                          y = mean)) +
  geom_errorbar(mapping = aes(color = counterfactual,
                              group = counterfactual,
                              ymin = lower,
                              ymax = upper),
                width = 0) +
  scale_y_continuous(labels = scales::percent_format(),
                     limits = c(0,1)) +
  scale_color_manual(values = 
                       cbbPalette[c(2,6,5)]) +
  scale_linetype_discrete(name = "Actual Inequality") +
  theme(legend.position = "bottom")  +
  labs(color = NULL,
       x = NULL,
       y = "Predicted Percent of Respondents\nWho Agree the ``Rich Get Richer''")
## figures/counterfactual_effects_party_5_year_bins
## 5x7


#############################################################
####### Effects Counterfactual: Annual, 1990 #################
#############################################################

## These results show what happens when we use
## an alernative kink point (1990). These results
## are included in the SI, section C.

data.boot <- data.full %>%
  filter(year > 1989)

data.boot$party.harm <- factor(data.boot$party.harm,
                               levels = c("Democrat",
                                          "Indep/Other",
                                          "Republican"))

set.seed(97403)
means_annual_90 <- list()
for(i in 1:1000){
  data.boot_c <- data.boot[sample(1:nrow(data.boot),
                                  nrow(data.boot),
                                  replace = TRUE), ]
  model <- lm(inequality.num ~ +
                factor(year)*party.harm + ses + cohort2 + race2 + gender, 
              data = data.boot_c)
  model.m <- model.matrix(model)
  observed <- model.m %*% coef(model) 
  
  coef <- coef(model)
  coef[35:length(coef)] <- 0
  
  counter <- model.m %*% coef
  
  ## fundamental uncertainty - generating errors
  ## for each individual 
  e_i <- rnorm(nrow(model.m), mean = 0, 
               sd = summary(model)$sigma)
  
  observed <- observed + e_i
  counter <- counter + e_i
  
  frame <- data.frame(observed = observed,
                      counter = counter,
                      year = data.boot_c$year_date,
                      weight_cat = data.boot_c$Year_Gender_Age_Edu)
  
  # recalculating weights
  ## 1) create counts by year 
  
  table <- data.boot_c %>%
    dplyr::group_by(year, Year_Gender_Age_Edu) %>%
    dplyr::summarize(survey_n = n())
  
  ## 2) calculate total n for each year
  study_n <- table %>%
    dplyr::group_by(year) %>%
    dplyr::summarize(total = sum(survey_n))
  table <- left_join(table, study_n, by = "year")
  
  ## 3) join weights and table
  table <- left_join(table, weights[, c("Year_Gender_Age_Edu",
                                        "Population")],
                     by = "Year_Gender_Age_Edu")
  
  ## 4) Create weights
  ## observed is the observed percent in our survey for that category
  table$observed <- table$survey_n/table$total
  
  ## Upweights category which were under observed 
  ## in data, downweights over observed categories
  table$weight_boot <- table$Population/table$observed
  
  ## merging with data
  data.boot_c$weight_boot <- table$weight_boot[
    match(data.boot_c$Year_Gender_Age_Edu,
          table$Year_Gender_Age_Edu)
  ]
  
  # trim weights to .3 to 3 
  data.boot_c$weight_boot[!is.na(data.boot_c$weight_boot) &
                            data.boot_c$weight_boot > 3] <- 3
  data.boot_c$weight_boot[is.na(data.boot_c$weight_boot) &
                            data.boot_c$weight_boot < .3] <- .3
  
  frame$weight_boot <- data.boot_c$weight_boot
  
  means_annual_90[[i]] <- frame %>%
    gather(key = "counterfactual",
           value = "percent",
           -year,
           -weight_boot,
           -weight_cat) %>%
    dplyr::group_by(year, counterfactual) %>%
    dplyr::summarize(mean = weighted.mean(percent,
                                          weight_boot))
  print(i)
  
}


means_annual_90 <- bind_rows(means_annual_90)

plot <- means_annual_90 %>%
  dplyr::group_by(year, counterfactual) %>%
  dplyr::summarize(mean_o = mean(mean, na.rm = TRUE),
            lower = quantile(mean, .025, na.rm = TRUE),
            upper = quantile(mean, .975, na.rm = TRUE))
colnames(plot)[3] <- "mean"


plot$counterfactual <- dplyr::recode(plot$counterfactual,
                                     `counter` = "Counterfactual estimate:\neffect of being\nrepublican/indep\nset at 1990",
                                     `observed` = "Observed estimate:\neffect of being\nrepublican/indep\nset at observed levels")

ggplot(plot,
       aes(x = year,
           y = mean)) +
  geom_smooth(data = overall_mean,
              se = FALSE,
              color = cbbPalette[5]) +
  geom_point(data = overall_mean,
             mapping = aes(color = "Weighted year\nmeans")) +
  geom_ribbon(data = overall_mean,
              mapping = aes(ymin = lower,
                            ymax = upper),
              alpha = .2,
              fill = cbbPalette[c(5)]) +
  theme_bw() +
  geom_point(mapping = aes(color = counterfactual,
                           group = counterfactual,
                           y = mean)) +
  geom_line(mapping = aes(color = counterfactual,
                          group = counterfactual,
                          y = mean)) +
  geom_errorbar(mapping = aes(color = counterfactual,
                              group = counterfactual,
                              ymin = lower,
                              ymax = upper),
                width = 0) +
  scale_y_continuous(labels = scales::percent_format(),
                     limits = c(0,1)) +
  scale_color_manual(values = 
                       cbbPalette[c(2,6,5)]) +
  scale_linetype_discrete(name = "Actual Inequality") +
  theme(legend.position = "bottom")  +
  labs(color = NULL,
       x = NULL,
       y = "Predicted Percent of Respondents\nWho Agree the ``Rich Get Richer''")
## figures/counterfactual_effects_party_1990
## 5x7

#############################################################
####### Effects Counterfactual: Annual, 1992, Indep/Other as Ref #################
#############################################################

## This code examines what happens when 
## we use an alternative reference group.
## These results are included in the SI, Section C. 

data.boot <- data.full %>%
  filter(year > 1991)

data.boot$party.harm <- factor(data.boot$party.harm,
                               levels = c("Indep/Other",
                                          "Republican",
                                          "Democrat"))

set.seed(97407)
means_annual_92 <- list()
for(i in 1:1000){
  data.boot_c <- data.boot[sample(1:nrow(data.boot),
                                  nrow(data.boot),
                                  replace = TRUE), ]
  model <- lm(inequality.num ~ +
                factor(year)*party.harm + ses + cohort2 + race2 + gender, 
              data = data.boot_c)
  model.m <- model.matrix(model)
  observed <- model.m %*% coef(model) 
  
  coef <- coef(model)
  coef[33:length(coef)] <- 0
  
  counter <- model.m %*% coef
  
  ## fundamental uncertainty - generating errors
  ## for each individual 
  e_i <- rnorm(nrow(model.m), mean = 0, 
               sd = summary(model)$sigma)
  
  observed <- observed + e_i
  counter <- counter + e_i
  
  
  frame <- data.frame(observed = observed,
                      counter = counter,
                      year = data.boot_c$year_date,
                      weight_cat = data.boot_c$Year_Gender_Age_Edu)
  
  # recalculating weights
  ## 1) create counts by year 
  
  table <- data.boot_c %>%
    dplyr::group_by(year, Year_Gender_Age_Edu) %>%
    dplyr::summarize(survey_n = n())
  
  ## 2) calculate total n for each year
  study_n <- table %>%
    dplyr::group_by(year) %>%
    dplyr::summarize(total = sum(survey_n))
  table <- left_join(table, study_n, by = "year")
  
  ## 3) join weights and table
  table <- left_join(table, weights[, c("Year_Gender_Age_Edu",
                                        "Population")],
                     by = "Year_Gender_Age_Edu")
  
  ## 4) Create weights
  ## observed is the observed percent in our survey for that category
  table$observed <- table$survey_n/table$total
  
  ## Upweights category which were under observed 
  ## in data, downweights over observed categories
  table$weight_boot <- table$Population/table$observed
  
  ## merging with data
  data.boot_c$weight_boot <- table$weight_boot[
    match(data.boot_c$Year_Gender_Age_Edu,
          table$Year_Gender_Age_Edu)
  ]
  
  # trim weights to .3 to 3 
  data.boot_c$weight_boot[!is.na(data.boot_c$weight_boot) &
                            data.boot_c$weight_boot > 3] <- 3
  data.boot_c$weight_boot[is.na(data.boot_c$weight_boot) &
                            data.boot_c$weight_boot < .3] <- .3
  
  frame$weight_boot <- data.boot_c$weight_boot
  
  means_annual_92[[i]] <- frame %>%
    gather(key = "counterfactual",
           value = "percent",
           -year,
           -weight_boot,
           -weight_cat) %>%
    dplyr::group_by(year, counterfactual) %>%
    dplyr::summarize(mean = weighted.mean(percent,
                                          weight_boot))
  print(i)
  
}

means_annual_92 <- bind_rows(means_annual_92)

plot <- means_annual_92 %>%
  dplyr::group_by(year, counterfactual) %>%
  dplyr::summarize(mean_o = mean(mean, na.rm = TRUE),
            lower = quantile(mean, .025, na.rm = TRUE),
            upper = quantile(mean, .975, na.rm = TRUE))
colnames(plot)[3] <- "mean"


plot$counterfactual <- dplyr::recode(plot$counterfactual,
                                     `counter` = "Counterfactual estimate:\neffect of being\nrepublican/indep\nset at 1992, Indep/Other Reference",
                                     `observed` = "Observed estimate:\neffect of being\nrepublican/indep\nset at observed levels")

ggplot(plot,
       aes(x = year,
           y = mean)) +
  geom_smooth(data = overall_mean,
              se = FALSE,
              color = cbbPalette[5]) +
  geom_point(data = overall_mean,
             mapping = aes(color = "Weighted year\nmeans")) +
  geom_ribbon(data = overall_mean,
              mapping = aes(ymin = lower,
                            ymax = upper),
              alpha = .2,
              fill = cbbPalette[c(5)]) +
  theme_bw() +
  geom_point(mapping = aes(color = counterfactual,
                           group = counterfactual,
                           y = mean)) +
  geom_line(mapping = aes(color = counterfactual,
                          group = counterfactual,
                          y = mean)) +
  geom_errorbar(mapping = aes(color = counterfactual,
                              group = counterfactual,
                              ymin = lower,
                              ymax = upper),
                width = 0) +
  scale_y_continuous(labels = scales::percent_format()) +
  scale_color_manual(values = 
                       cbbPalette[c(2,6,5)]) +
  scale_linetype_discrete(name = "Actual Inequality") +
  theme(legend.position = "bottom")  +
  labs(color = NULL,
       x = NULL,
       y = "Predicted Percent of Respondents\nWho Agree the ``Rich Get Richer''")
## figures/counterfactual_effects_party_1992_indep_ref
## 5x7

###########################################
######## Counterfactual Levels Graphs #####
###########################################

## In the main text, Figure 6 we display our
## counterfactual levels graph. 

## for the counterfactual levels simulation
## we examine the following question:
## what if we kept the distribution of 
## cohort and party at their 
## levels observed before the downturn - 
## could this help account
## for the down turn?

## we use our model to get a prediction for each individual
## in our data, and then weight the estimates under two different 
## weighting schemes:
## 1) weighting to population estimates for party or cohort distribution
## for that year (observed)
## 2) weighting to population estimates for 1992

## we take five steps:
## 1) We estimate using our post-stratification weights 
## the population distribution
## of our party and cohort variables for each year 
## 2) We identify the model with the lowest
## mean squared error out of the models considered
## 3) We create 1,000 bootstrapped datasets
## 4) For each bootstrapped dataset, we construct
## an observed and counterfactual weight for
## based on party-year and cohort-year population estimates:
## a) observed weight: weights means to be more representative 
## of population estimates for party/cohort for that year
## b) counterfactual weight: weights means to be more representative
## of population estimates from 1992 for party/cohort
## 5) For each bootstrapped dataset we get a model-based
## prediction of each individual in the dataset, and then
## calculate weighted year means based on the observed
## and counterfactual weights. 

## Note: We do three separate simulations, one for party identification
## distribution, one for cohort distribution, and one for the joint
## party-ses distribuiton. We do all three simulation within each
## bootstrapped dataset, so the code for all three is contained
## within a single for loop. 

### Step 1: 
### Estimate Population Estimates for Cohort/Party Distribution By Year:

## cohort (population estimates):
overall_cohort <- data.full %>%
  filter(!is.na(weight.fulldata)) %>%
  group_by(year_date) %>%
  summarise(Boomers = weighted_proportion(vec = cohort2,
                                          value = "Boomers",
                                          weight = weight.fulldata),
            `Before Greatest` = weighted_proportion(vec = cohort2,
                                                    value = "Before Greatest",
                                                    weight = weight.fulldata),
            `Greatest` = weighted_proportion(vec = cohort2,
                                             value = "Greatest",
                                             weight = weight.fulldata),
            `Silent` = weighted_proportion(vec = cohort2,
                                           value = "Silent",
                                           weight = weight.fulldata),
            `Gen-X` = weighted_proportion(vec = cohort2,
                                          value = "Gen-X",
                                          weight = weight.fulldata),
            `Millenials` = weighted_proportion(vec = cohort2,
                                               value = "Millenials",
                                               weight = weight.fulldata)) %>%
  gather(key = "Cohort",
         value = "Population Estimate",
         -year_date)

## party (population estimates):
overall_party <- data.full %>%
  filter(!is.na(weight.fulldata)) %>%
  group_by(year_date) %>%
  summarise(Democrat = weighted_proportion(vec = party.harm,
                                           value = "Democrat",
                                           weight = weight.fulldata),
            Republican = weighted_proportion(vec = party.harm,
                                             value = "Republican",
                                             weight = weight.fulldata),
            `Indep/Other` = weighted_proportion(vec = party.harm,
                                                value = "Indep/Other",
                                                weight = weight.fulldata)) %>%
  gather(key = "Party",
         value = "Population Estimate",
         -year_date)

## party-SES (population estimates)
data.full$party.harm_SES <- paste(data.full$party.harm,
                                  data.full$ses,
                                  sep = "_")

overall_party_ses <- data.full %>%
  filter(!is.na(weight.fulldata)) %>%
  group_by(year_date) %>%
  summarise(Democrat_1 = weighted_proportion(vec = party.harm_SES,
                                             value = "Democrat_1",
                                             weight = weight.fulldata),
            Democrat_2 = weighted_proportion(vec = party.harm_SES,
                                             value = "Democrat_2",
                                             weight = weight.fulldata),
            Democrat_3 = weighted_proportion(vec = party.harm_SES,
                                             value = "Democrat_3",
                                             weight = weight.fulldata),
            Democrat_4 = weighted_proportion(vec = party.harm_SES,
                                             value = "Democrat_4",
                                             weight = weight.fulldata),
            Democrat_5 = weighted_proportion(vec = party.harm_SES,
                                             value = "Democrat_5",
                                             weight = weight.fulldata),
            Republican_1 = weighted_proportion(vec = party.harm_SES,
                                               value = "Republican_1",
                                               weight = weight.fulldata),
            Republican_2 = weighted_proportion(vec = party.harm_SES,
                                               value = "Republican_2",
                                               weight = weight.fulldata),
            Republican_3 = weighted_proportion(vec = party.harm_SES,
                                               value = "Republican_3",
                                               weight = weight.fulldata),
            Republican_4 = weighted_proportion(vec = party.harm_SES,
                                               value = "Republican_4",
                                               weight = weight.fulldata),
            Republican_5 = weighted_proportion(vec = party.harm_SES,
                                               value = "Republican_5",
                                               weight = weight.fulldata),
            `Indep/Other_1` = weighted_proportion(vec = party.harm_SES,
                                                  value = "Indep/Other_1",
                                                  weight = weight.fulldata),
            `Indep/Other_2` = weighted_proportion(vec = party.harm_SES,
                                                  value = "Indep/Other_2",
                                                  weight = weight.fulldata),
            `Indep/Other_3` = weighted_proportion(vec = party.harm_SES,
                                                  value = "Indep/Other_3",
                                                  weight = weight.fulldata),
            `Indep/Other_4` = weighted_proportion(vec = party.harm_SES,
                                                  value = "Indep/Other_4",
                                                  weight = weight.fulldata),
            `Indep/Other_5` = weighted_proportion(vec = party.harm_SES,
                                                  value = "Indep/Other_5",
                                                  weight = weight.fulldata)) %>%
  gather(key = "Party_SES",
         value = "Population Estimate",
         -year_date)


## Step 2: Determine Model

## we use the model with lowest mse

data.boot <- data.full %>%
  filter(year > 1991)

model1 <- lm(inequality.num ~ +
               factor(year)*party.harm + ses + cohort2 + race2 + gender, 
             data = data.boot)
model2 <- lm(inequality.num ~ +
               factor(year) + party.harm + ses + cohort2 + race2 + gender, 
             data = data.boot)
model3 <- lm(inequality.num ~ +
               factor(year)*ses + party.harm + cohort2 + race2 +gender, 
             data = data.boot)
model4 <- lm(inequality.num ~ +
               factor(year)*cohort2 + party.harm + ses + race2 + gender, 
             data = data.boot)
model5 <- lm(inequality.num ~ +
               factor(year)*race2 + party.harm + ses + cohort2 + gender, 
             data = data.boot)
model6 <- lm(inequality.num ~ +
               factor(year)*party.harm*ses + race2 + cohort2 + gender, 
             data = data.boot)


mean(model1$residuals^2) 
mean(model2$residuals^2)
mean(model3$residuals^2)
mean(model4$residuals^2)
mean(model5$residuals^2)
mean(model6$residuals^2) ## this is the lowest

## Bootstrap (Steps 3-5)
data.boot$party.harm_SES <- paste(data.boot$party.harm,
                                  data.boot$ses,
                                  sep = "_")

set.seed(97402)
means_levels_92 <- list()
for(i in 1:1000){
  data.boot_c <- data.boot[sample(1:nrow(data.boot),
                                  nrow(data.boot),
                                  replace = TRUE), ]
  model <- lm(inequality.num ~ +
                 factor(year)*party.harm*ses + race2 + cohort2 + gender, 
               data = data.boot_c)
  
  ## calculate observed distributions
  ## of each party, cohort, and party-ses grouping
  ## in bootstrapped dataset
  ## within sample estimates for cohort: 
  observed_cohort <- data.boot_c %>%
    group_by(year_date) %>%
    summarise(Boomers = mean(cohort2 == "Boomers"),
              `Before Greatest` = mean(cohort2 == "Before Greatest"),
              `Greatest` = mean(cohort2 == "Greatest"),
              `Silent` = mean(cohort2 == "Silent"),
              `Gen-X` = mean(cohort2 == "Gen-X"),
              `Millenials` = mean(cohort2 == "Millenials")) %>%
    gather(key = "Cohort",
           value = "Percent",
           -year_date)
  
  
  ## party (sample estimates):
  observed_party <- data.boot_c %>%
    group_by(year_date) %>%
    summarise(Democrat = mean(party.harm == "Democrat"),
              Republican = mean(party.harm == "Republican"),
              `Indep/Other` = mean(party.harm == "Indep/Other")) %>%
    gather(key = "Party",
           value = "Percent",
           -year_date)
  
  ## party (sample estimates):
  observed_party_ses <- data.boot_c %>%
    group_by(year_date) %>%
    summarise(Democrat_1 = mean(party.harm_SES == "Democrat_1"),
              Democrat_2 = mean(party.harm_SES == "Democrat_2"),
              Democrat_3 = mean(party.harm_SES == "Democrat_3"),
              Democrat_4 = mean(party.harm_SES == "Democrat_4"),
              Democrat_5 = mean(party.harm_SES == "Democrat_5"),
              Republican_1 = mean(party.harm_SES == "Republican_1"),
              Republican_2 = mean(party.harm_SES == "Republican_2"),
              Republican_3 = mean(party.harm_SES == "Republican_3"),
              Republican_4 = mean(party.harm_SES == "Republican_4"),
              Republican_5 = mean(party.harm_SES == "Republican_5"),
              `Indep/Other_1` = mean(party.harm_SES == "Indep/Other_1"),
              `Indep/Other_2` = mean(party.harm_SES == "Indep/Other_2"),
              `Indep/Other_3` = mean(party.harm_SES == "Indep/Other_3"),
              `Indep/Other_4` = mean(party.harm_SES == "Indep/Other_4"),
              `Indep/Other_5` = mean(party.harm_SES == "Indep/Other_5")) %>%
    gather(key = "Party_SES",
           value = "Percent",
           -year_date)
  
  ## merge by year and variable:
  observed_party <- left_join(observed_party,
                              overall_party,
                              by = c("year_date",
                                     "Party"))
  observed_party_ses <- left_join(observed_party_ses,
                                  overall_party_ses,
                                  by = c("year_date",
                                         "Party_SES"))
  observed_cohort <- left_join(observed_cohort,
                               overall_cohort,
                               by = c("year_date",
                                      "Cohort"))
  
  ## restrict to 1992 counterfactual levels
  overall_party_1992 <- overall_party %>%
    filter(year_date == "1992-01-01") %>%
    dplyr::select(-year_date)
  colnames(overall_party_1992)[2] <- "Population Estimate 1992"
  
  overall_party_ses_1992 <- overall_party_ses %>%
    filter(year_date == "1992-01-01") %>%
    dplyr::select(-year_date)
  colnames(overall_party_ses_1992)[2] <- "Population Estimate 1992"
  
  overall_cohort_1992 <- overall_cohort %>%
    filter(year_date == "1992-01-01") %>%
    dplyr::select(-year_date)
  colnames(overall_cohort_1992)[2] <- "Population Estimate 1992"
  
  ## merge in counterfactual levels
  observed_party <- left_join(observed_party,
                              overall_party_1992,
                              by = c("Party"))
  
  observed_party_ses <- left_join(observed_party_ses,
                                  overall_party_ses_1992,
                                  by = c("Party_SES"))
  observed_cohort <- left_join(observed_cohort,
                               overall_cohort_1992,
                               by = c("Cohort"))
  
  
  ## creating observed and counterfactual survey weights -
  ## upweights data from category 
  ## which were under observed 
  ## in data, downweights over observed categories
  observed_party$weight_obs <- observed_party$`Population Estimate` /
    observed_party$Percent
  observed_party$weight_counter <- observed_party$`Population Estimate 1992` /
    observed_party$Percent
  
  observed_party_ses$weight_obs <- observed_party_ses$`Population Estimate` /
    observed_party_ses$Percent
  observed_party_ses$weight_counter <- observed_party_ses$`Population Estimate 1992` /
    observed_party_ses$Percent
  
  observed_cohort$weight_obs <- observed_cohort$`Population Estimate` /
    observed_cohort$Percent
  observed_cohort$weight_counter <- observed_cohort$`Population Estimate 1992` /
    observed_cohort$Percent
  
  ## trim weights
  observed_party$weight_counter[!is.na(observed_party$weight_counter) &
                                  observed_party$weight_counter > 3] <- 3
  observed_party$weight_counter[!is.na(observed_party$weight_counter) &
                                  observed_party$weight_counter < .3] <- .3
  observed_party_ses$weight_counter[!is.na(observed_party_ses$weight_counter) &
                                      observed_party_ses$weight_counter > 3] <- 3
  observed_party_ses$weight_counter[!is.na(observed_party_ses$weight_counter) &
                                      observed_party_ses$weight_counter < .3] <- .3
  
  observed_cohort$weight_counter[!is.na(observed_cohort$weight_counter) &
                                   observed_cohort$weight_counter > 3] <- 3
  observed_cohort$weight_counter[!is.na(observed_cohort$weight_counter) &
                                   observed_cohort$weight_counter < .3] <- .3
  
  ## merge weights to data
  colnames(observed_cohort)[c(6,7)] <- c("cohort_weight_observed",
                                         "cohort_weight_counterfactual")
  colnames(observed_party)[c(6,7)] <- c("party_weight_observed",
                                        "party_weight_counterfactual")
  
  colnames(observed_party_ses)[c(6,7)] <- c("party_ses_weight_observed",
                                            "party_ses_weight_counterfactual")
  
  
  data.boot_c <- left_join(data.boot_c,
                         observed_party[, c("Party",
                                            "year_date",
                                            "party_weight_observed",
                                            "party_weight_counterfactual")],
                         by = c("party.harm" = "Party",
                                "year_date"))
  
  data.boot_c <- left_join(data.boot_c,
                         observed_cohort[, c("Cohort",
                                             "year_date",
                                             "cohort_weight_observed",
                                             "cohort_weight_counterfactual")],
                         by = c("cohort2" = "Cohort",
                                "year_date"))
  
  data.boot_c <- left_join(data.boot_c,
                         observed_party_ses[, c("Party_SES",
                                                "year_date",
                                                "party_ses_weight_observed",
                                                "party_ses_weight_counterfactual")],
                         by = c("party.harm_SES" = "Party_SES",
                                "year_date"))
  
  ## generate prediction for each individual in bootstrapped
  ## data 
  m <- model.matrix(model)
  vals <- m %*% coef(model)
  
  ## fundamental uncertainty
  e_i <- rnorm(nrow(m), mean = 0, 
               sd = summary(model)$sigma)
  
  vals <- vals + e_i
  
  ## pull together predictions and weights
  frame <- data.frame(observed = vals,
                      year = data.boot_c$year_date,
                      cohort_weight_observed = data.boot_c$cohort_weight_observed,
                      cohort_weight_counterfactual = data.boot_c$cohort_weight_counterfactual,
                      party_weight_observed = data.boot_c$party_weight_observed,
                      party_weight_counterfactual = data.boot_c$party_weight_counterfactual,
                      party_ses_weight_observed = data.boot_c$party_ses_weight_observed,
                      party_ses_weight_counterfactual = data.boot_c$party_ses_weight_counterfactual)
  
  ## calculate weighted means
  means_levels_92[[i]] <- frame %>%
    dplyr::group_by(year) %>%
    dplyr::summarize(mean_cohort_observed = weighted.mean(observed, 
                                                          w = cohort_weight_observed),
                     mean_cohort_counterfactual = weighted.mean(observed, 
                                                                w = cohort_weight_counterfactual),
                     mean_party_observed = weighted.mean(observed, 
                                                         w = party_weight_observed),
                     mean_party_counterfactual = weighted.mean(observed, 
                                                               w = party_weight_counterfactual),
                     mean_party_ses_observed = weighted.mean(observed, 
                                                             w = party_ses_weight_observed),
                     mean_party_ses_counterfactual = weighted.mean(observed, 
                                                                   w = party_ses_weight_counterfactual))
  print(i)
  
}


means_levels_92 <- bind_rows(means_levels_92)

plot <- means_levels_92 %>%
  gather(key = "counterfactual",
         value = "mean",
         -year) %>%
  dplyr::group_by(year, counterfactual) %>%
  dplyr::summarize(mean_0 = mean(mean, na.rm = TRUE),
                   lower = quantile(mean, .025, na.rm = TRUE),
                   upper = quantile(mean, .975, na.rm = TRUE))
colnames(plot)[3] <- "mean"

plot$simulation <- ifelse(grepl("cohort",
                            plot$counterfactual),
                      "Cohort",
                      ifelse(grepl("ses",
                                   plot$counterfactual),
                      "Party-SES", "Party"))
plot$level <- ifelse(grepl("counterfactual",
                                plot$counterfactual),
                          "Counterfactual Estimate:\nDistribution of Variable\nSet at 1992",
                     "Observed Estimate:\nDistribution of Variable\nSet at Observed Levels")


## The figure excluding the "Party-SES"
## simulation is included in the main text, Figure
## 6. The standalone "Party-SES" figure is included
## in the SI, section C. 

ggplot(plot[plot$simulation == "Party-SES", ],
       aes(x = year,
           y = mean,
           color = level,
           group = level,
           ymin = lower,
           ymax = upper)) +
  facet_wrap(~simulation) +
  theme_bw() +
  geom_point() +
  geom_line() +
  geom_errorbar(width = 0) +
  scale_y_continuous(labels = scales::percent_format()) +
  scale_color_manual(values = 
                       cbbPalette[c(2,6,5)]) +
  scale_linetype_discrete(name = "Actual Inequality") +
  theme(legend.position = "bottom")  +
  labs(color = NULL,
       x = NULL,
       y = "Predicted Percent of Respondents\nWho Agree the ``Rich Get Richer''")
## figures/counterfactual_levels_1992
## 5x7
## party-ses:
## figures/counterfactual_levels_1992_party_ses

###################################
### Alternative Level Simulation ##
####################################

## Here we use the same bootstrapping
## procedure, but we don't predict values
## for each individual and instead use the observed
## inequality.num balues in the individual
## bootstrapped samples. 
## As the results are largely the same
## we don't include these results in the text/SI,
## but we do reference them in the main text. 

set.seed(97402)
means_levels_92_alt <- list()
for(i in 1:1000){
  data.boot_c <- data.boot[sample(1:nrow(data.boot),
                                  nrow(data.boot),
                                  replace = TRUE), ]

  
  ## calculate observed distributions
  ## of each party, cohort, and party-ses grouping
  ## in bootstrapped dataset
  ## within sample estimates for cohort: 
  observed_cohort <- data.boot_c %>%
    group_by(year_date) %>%
    summarise(Boomers = mean(cohort2 == "Boomers"),
              `Before Greatest` = mean(cohort2 == "Before Greatest"),
              `Greatest` = mean(cohort2 == "Greatest"),
              `Silent` = mean(cohort2 == "Silent"),
              `Gen-X` = mean(cohort2 == "Gen-X"),
              `Millenials` = mean(cohort2 == "Millenials")) %>%
    gather(key = "Cohort",
           value = "Percent",
           -year_date)
  
  
  ## party (sample estimates):
  observed_party <- data.boot_c %>%
    group_by(year_date) %>%
    summarise(Democrat = mean(party.harm == "Democrat"),
              Republican = mean(party.harm == "Republican"),
              `Indep/Other` = mean(party.harm == "Indep/Other")) %>%
    gather(key = "Party",
           value = "Percent",
           -year_date)
  
  ## party (sample estimates):
  observed_party_ses <- data.boot_c %>%
    group_by(year_date) %>%
    summarise(Democrat_1 = mean(party.harm_SES == "Democrat_1"),
              Democrat_2 = mean(party.harm_SES == "Democrat_2"),
              Democrat_3 = mean(party.harm_SES == "Democrat_3"),
              Democrat_4 = mean(party.harm_SES == "Democrat_4"),
              Democrat_5 = mean(party.harm_SES == "Democrat_5"),
              Republican_1 = mean(party.harm_SES == "Republican_1"),
              Republican_2 = mean(party.harm_SES == "Republican_2"),
              Republican_3 = mean(party.harm_SES == "Republican_3"),
              Republican_4 = mean(party.harm_SES == "Republican_4"),
              Republican_5 = mean(party.harm_SES == "Republican_5"),
              `Indep/Other_1` = mean(party.harm_SES == "Indep/Other_1"),
              `Indep/Other_2` = mean(party.harm_SES == "Indep/Other_2"),
              `Indep/Other_3` = mean(party.harm_SES == "Indep/Other_3"),
              `Indep/Other_4` = mean(party.harm_SES == "Indep/Other_4"),
              `Indep/Other_5` = mean(party.harm_SES == "Indep/Other_5")) %>%
    gather(key = "Party_SES",
           value = "Percent",
           -year_date)
  
  ## merge by year and variable:
  observed_party <- left_join(observed_party,
                              overall_party,
                              by = c("year_date",
                                     "Party"))
  observed_party_ses <- left_join(observed_party_ses,
                                  overall_party_ses,
                                  by = c("year_date",
                                         "Party_SES"))
  observed_cohort <- left_join(observed_cohort,
                               overall_cohort,
                               by = c("year_date",
                                      "Cohort"))
  
  ## restrict to 1992 counterfactual levels
  overall_party_1992 <- overall_party %>%
    filter(year_date == "1992-01-01") %>%
    dplyr::select(-year_date)
  colnames(overall_party_1992)[2] <- "Population Estimate 1992"
  
  overall_party_ses_1992 <- overall_party_ses %>%
    filter(year_date == "1992-01-01") %>%
    dplyr::select(-year_date)
  colnames(overall_party_ses_1992)[2] <- "Population Estimate 1992"
  
  overall_cohort_1992 <- overall_cohort %>%
    filter(year_date == "1992-01-01") %>%
    dplyr::select(-year_date)
  colnames(overall_cohort_1992)[2] <- "Population Estimate 1992"
  
  ## merge in counterfactual levels
  observed_party <- left_join(observed_party,
                              overall_party_1992,
                              by = c("Party"))
  
  observed_party_ses <- left_join(observed_party_ses,
                                  overall_party_ses_1992,
                                  by = c("Party_SES"))
  observed_cohort <- left_join(observed_cohort,
                               overall_cohort_1992,
                               by = c("Cohort"))
  
  
  ## creating observed and counterfactual survey weights -
  ## upweights data from category 
  ## which were under observed 
  ## in data, downweights over observed categories
  observed_party$weight_obs <- observed_party$`Population Estimate` /
    observed_party$Percent
  observed_party$weight_counter <- observed_party$`Population Estimate 1992` /
    observed_party$Percent
  
  observed_party_ses$weight_obs <- observed_party_ses$`Population Estimate` /
    observed_party_ses$Percent
  observed_party_ses$weight_counter <- observed_party_ses$`Population Estimate 1992` /
    observed_party_ses$Percent
  
  observed_cohort$weight_obs <- observed_cohort$`Population Estimate` /
    observed_cohort$Percent
  observed_cohort$weight_counter <- observed_cohort$`Population Estimate 1992` /
    observed_cohort$Percent
  
  ## trim weights
  observed_party$weight_counter[!is.na(observed_party$weight_counter) &
                                  observed_party$weight_counter > 3] <- 3
  observed_party$weight_counter[!is.na(observed_party$weight_counter) &
                                  observed_party$weight_counter < .3] <- .3
  observed_party_ses$weight_counter[!is.na(observed_party_ses$weight_counter) &
                                      observed_party_ses$weight_counter > 3] <- 3
  observed_party_ses$weight_counter[!is.na(observed_party_ses$weight_counter) &
                                      observed_party_ses$weight_counter < .3] <- .3
  
  observed_cohort$weight_counter[!is.na(observed_cohort$weight_counter) &
                                   observed_cohort$weight_counter > 3] <- 3
  observed_cohort$weight_counter[!is.na(observed_cohort$weight_counter) &
                                   observed_cohort$weight_counter < .3] <- .3
  
  ## merge weights to data
  colnames(observed_cohort)[c(6,7)] <- c("cohort_weight_observed",
                                         "cohort_weight_counterfactual")
  colnames(observed_party)[c(6,7)] <- c("party_weight_observed",
                                        "party_weight_counterfactual")
  
  colnames(observed_party_ses)[c(6,7)] <- c("party_ses_weight_observed",
                                            "party_ses_weight_counterfactual")
  
  
  data.boot_c <- left_join(data.boot_c,
                           observed_party[, c("Party",
                                              "year_date",
                                              "party_weight_observed",
                                              "party_weight_counterfactual")],
                           by = c("party.harm" = "Party",
                                  "year_date"))
  
  data.boot_c <- left_join(data.boot_c,
                           observed_cohort[, c("Cohort",
                                               "year_date",
                                               "cohort_weight_observed",
                                               "cohort_weight_counterfactual")],
                           by = c("cohort2" = "Cohort",
                                  "year_date"))
  
  data.boot_c <- left_join(data.boot_c,
                           observed_party_ses[, c("Party_SES",
                                                  "year_date",
                                                  "party_ses_weight_observed",
                                                  "party_ses_weight_counterfactual")],
                           by = c("party.harm_SES" = "Party_SES",
                                  "year_date"))
  
  
  ## pull together observed values and weights
  frame <- data.frame(observed = data.boot_c$inequality.num,
                      year = data.boot_c$year_date,
                      cohort_weight_observed = data.boot_c$cohort_weight_observed,
                      cohort_weight_counterfactual = data.boot_c$cohort_weight_counterfactual,
                      party_weight_observed = data.boot_c$party_weight_observed,
                      party_weight_counterfactual = data.boot_c$party_weight_counterfactual,
                      party_ses_weight_observed = data.boot_c$party_ses_weight_observed,
                      party_ses_weight_counterfactual = data.boot_c$party_ses_weight_counterfactual)
  
  ## calculate weighted means
  means_levels_92_alt[[i]] <- frame %>%
    dplyr::group_by(year) %>%
    dplyr::summarize(mean_cohort_observed = weighted.mean(observed, 
                                                          w = cohort_weight_observed),
                     mean_cohort_counterfactual = weighted.mean(observed, 
                                                                w = cohort_weight_counterfactual),
                     mean_party_observed = weighted.mean(observed, 
                                                         w = party_weight_observed),
                     mean_party_counterfactual = weighted.mean(observed, 
                                                               w = party_weight_counterfactual),
                     mean_party_ses_observed = weighted.mean(observed, 
                                                             w = party_ses_weight_observed),
                     mean_party_ses_counterfactual = weighted.mean(observed, 
                                                                   w = party_ses_weight_counterfactual))
  print(i)
  
}

means_levels_92_alt <- bind_rows(means_levels_92_alt)

plot <- means_levels_92_alt %>%
  gather(key = "counterfactual",
         value = "mean",
         -year) %>%
  dplyr::group_by(year, counterfactual) %>%
  dplyr::summarize(mean_0 = mean(mean, na.rm = TRUE),
                   lower = quantile(mean, .025, na.rm = TRUE),
                   upper = quantile(mean, .975, na.rm = TRUE))
colnames(plot)[3] <- "mean"

plot$simulation <- ifelse(grepl("cohort",
                                plot$counterfactual),
                          "Cohort",
                          ifelse(grepl("ses",
                                       plot$counterfactual),
                                 "Party-SES", "Party"))
plot$level <- ifelse(grepl("counterfactual",
                           plot$counterfactual),
                     "Counterfactual Estimate:\nDistribution of Variable\nSet at 1992",
                     "Observed Estimate:\nDistribution of Variable\nSet at Observed Levels")


ggplot(plot[plot$simulation != "Party-SES", ],
       aes(x = year,
           y = mean,
           color = level,
           group = level,
           ymin = lower,
           ymax = upper)) +
  facet_wrap(~simulation) +
  theme_bw() +
  geom_point() +
  geom_line() +
  geom_errorbar(width = 0) +
  scale_y_continuous(labels = scales::percent_format()) +
  scale_color_manual(values = 
                       cbbPalette[c(2,6,5)]) +
  scale_linetype_discrete(name = "Actual Inequality") +
  theme(legend.position = "bottom")  +
  labs(color = NULL,
       x = NULL,
       y = "Predicted Percent of Respondents\nWho Agree the ``Rich Get Richer''")
